#Loading Libraries

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(boot)
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ✔ purrr   1.0.0      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lmboot)
## Warning: package 'lmboot' was built under R version 4.2.3
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(naniar)
library(utils)
library(stats)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:boot':
## 
##     logit
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.2.3
## 
## Attaching package: 'olsrr'
## 
## The following object is masked from 'package:datasets':
## 
##     rivers

##Reading in Dataset

life = read.csv('https://github.com/athibeaux/MSDS-DS6372/raw/main/Life_Expectancy.csv')
ggplot(data = life) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))
## Warning: Removed 453 rows containing missing values (`geom_point()`).

##Upon looking at the graph of the original data set, it appears that there needs to be a log transformation on the X or the GDP as we are interested in seeing the relation between Life Expenctancy and GDP.

##Checking Data Types

str(life)
## 'data.frame':    2938 obs. of  22 variables:
##  $ Country                        : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                           : int  2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
##  $ Status                         : chr  "Developing" "Developing" "Developing" "Developing" ...
##  $ Life.expectancy                : num  65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult.Mortality                : int  263 271 268 272 275 279 281 287 295 295 ...
##  $ infant.deaths                  : int  62 64 66 69 71 74 77 80 82 84 ...
##  $ Alcohol                        : num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage.expenditure         : num  71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis.B                    : int  65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : int  1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
##  $ BMI                            : num  19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under.five.deaths              : int  83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : int  6 58 62 67 68 66 63 64 63 58 ...
##  $ Total.expenditure              : num  8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : int  65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV.AIDS                       : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num  584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num  33736494 327582 31731688 3696958 2978599 ...
##  $ thinness..1.19.years           : num  17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness.5.9.years             : num  17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income.composition.of.resources: num  0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num  10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
vis_miss(life)
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `gather()` instead.
## ℹ The deprecated feature was likely used in the visdat package.
##   Please report the issue at <]8;;https://github.com/ropensci/visdat/issueshttps://github.com/ropensci/visdat/issues]8;;>.

dim(life)
## [1] 2938   22

#Imputing using Median

#GDP 15% [17]  keep GDP to have it Imputed even if quite high percentage, assuming it is crucial to predicting Life.expectancy as richer countries have better health access/Medicine and tech. The numbers appear to be GDP per capita which helps as it addresses GDP/Population. GDP per Capita and Population would be too closely related and prob attribute to covariance.  

#Adjusting text angle to vis_miss
imputeMedian= preProcess(life[,-c(1:4,9)],method="medianImpute") #predictors 1:3, 9, 17 and response is 4
cleandataMedian = predict(imputeMedian,newdata=life)
dim(cleandataMedian)
## [1] 2938   22
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))

#Literature says that over 10% missing data can contribute to bias 
#HepatitsB [9] at 19% , Population 22% [18]. 
#Removing columns 9 and 18
cleandataMedian = cleandataMedian[,-c(18,9)]
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))

#removing last NA
cleandataMedian = na.omit(cleandataMedian)
vis_miss(cleandataMedian) + theme(axis.text.x = element_text(angle = 90, hjust = 0))

Splitting the data

set.seed(1234)
trainIndex<-createDataPartition(cleandataMedian$Life.expectancy,p=.8,list=F)  #p: proportion of data in train

training<-cleandataMedian[trainIndex,]
validate<-cleandataMedian[-trainIndex,]

Before and after log transforming GDP, with cleandataMedian:

ggplot(data = training) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))

#Log transformation on GDP 
ggplot(data = training) + geom_point(mapping = aes(x = log(GDP), y = Life.expectancy))

##Correlation Matrix:

cor <- cor(training[,c(4,5:20)])
corrplot(cor, method = "square",  tl.srt = 50, tl.col = "black", tl.cex = 0.6, title = "Correlation of Variables", mar=c(0,0,1,0))

GGPairs:

#commented out for knitting
#ggpairs(training[,4:20])

Create categorical variable from year:

summary(training$Year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2000    2004    2008    2008    2011    2015
#Labeled the breaks in years into 2 year buckets
training$DualYears <- cut(as.numeric(training$Year), breaks=c(0, 2001, 2003, 2005, 2007, 2009, 2011, 2013, 2015), labels=c('2000-2001', '2002-2003', '2004-2005', '2006-2007', '2008-2009', '2010-2011','2012-2013','2014-2015'))

training %>% ggplot() + geom_bar(aes(x = DualYears))

summary(training$DualYears)
## 2000-2001 2002-2003 2004-2005 2006-2007 2008-2009 2010-2011 2012-2013 2014-2015 
##       278       298       302       288       302       296       291       289

Full Model

KitchenSink <- lm(Life.expectancy~Year + Status + Adult.Mortality +
                  infant.deaths + Alcohol + percentage.expenditure +
                  Measles + BMI + under.five.deaths + Polio + 
                  Total.expenditure + Diphtheria + HIV.AIDS + log(GDP) + 
                    thinness..1.19.years + thinness.5.9.years +
                    Income.composition.of.resources + Schooling, training)
plot(KitchenSink)

vif(KitchenSink)
##                            Year                          Status 
##                        1.147699                        1.844672 
##                 Adult.Mortality                   infant.deaths 
##                        1.791904                      170.003056 
##                         Alcohol          percentage.expenditure 
##                        1.908148                        1.669605 
##                         Measles                             BMI 
##                        1.416355                        1.760019 
##               under.five.deaths                           Polio 
##                      172.685555                        1.993205 
##               Total.expenditure                      Diphtheria 
##                        1.204580                        2.033602 
##                        HIV.AIDS                        log(GDP) 
##                        1.486816                        2.081129 
##            thinness..1.19.years              thinness.5.9.years 
##                        8.502136                        8.625215 
## Income.composition.of.resources                       Schooling 
##                        3.206133                        3.559931
full.model <- lm(Life.expectancy~ Status + Alcohol + percentage.expenditure + 
                   Measles + BMI + under.five.deaths + Polio + 
                   Total.expenditure + Diphtheria + HIV.AIDS + log(GDP) + 
                   thinness..1.19.years +
                   Income.composition.of.resources + Schooling, training)

summary(full.model)
## 
## Call:
## lm(formula = Life.expectancy ~ Status + Alcohol + percentage.expenditure + 
##     Measles + BMI + under.five.deaths + Polio + Total.expenditure + 
##     Diphtheria + HIV.AIDS + log(GDP) + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.4131  -2.6858   0.1732   2.7454  19.5163 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      4.660e+01  8.279e-01  56.283  < 2e-16 ***
## StatusDeveloping                -2.284e+00  3.362e-01  -6.794 1.38e-11 ***
## Alcohol                         -5.985e-02  3.282e-02  -1.824   0.0683 .  
## percentage.expenditure           1.527e-04  6.299e-05   2.425   0.0154 *  
## Measles                         -9.719e-06  9.525e-06  -1.020   0.3077    
## BMI                              5.473e-02  6.262e-03   8.740  < 2e-16 ***
## under.five.deaths               -9.013e-04  7.723e-04  -1.167   0.2433    
## Polio                            2.864e-02  5.610e-03   5.106 3.56e-07 ***
## Total.expenditure                1.013e-01  4.285e-02   2.365   0.0181 *  
## Diphtheria                       4.782e-02  5.615e-03   8.515  < 2e-16 ***
## HIV.AIDS                        -6.789e-01  1.976e-02 -34.355  < 2e-16 ***
## log(GDP)                         4.448e-01  7.848e-02   5.668 1.62e-08 ***
## thinness..1.19.years            -6.931e-02  3.015e-02  -2.299   0.0216 *  
## Income.composition.of.resources  7.455e+00  8.198e-01   9.094  < 2e-16 ***
## Schooling                        7.672e-01  5.482e-02  13.994  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.583 on 2329 degrees of freedom
## Multiple R-squared:  0.7687, Adjusted R-squared:  0.7673 
## F-statistic: 552.9 on 14 and 2329 DF,  p-value: < 2.2e-16
plot(full.model)

vif(full.model)
##                          Status                         Alcohol 
##                        1.816727                        1.809490 
##          percentage.expenditure                         Measles 
##                        1.659251                        1.394714 
##                             BMI               under.five.deaths 
##                        1.717403                        1.728815 
##                           Polio               Total.expenditure 
##                        1.984437                        1.188715 
##                      Diphtheria                        HIV.AIDS 
##                        1.996048                        1.131097 
##                        log(GDP)            thinness..1.19.years 
##                        2.054859                        1.956459 
## Income.composition.of.resources                       Schooling 
##                        3.102486                        3.510213

Fitting a preliminary model of non-redundant variables

prelim.model <- lm(Life.expectancy~Alcohol + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years +
              Income.composition.of.resources + Schooling, training)

summary(prelim.model)
## 
## Call:
## lm(formula = Life.expectancy ~ Alcohol + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + log(GDP) + 
##     thinness..1.19.years + Income.composition.of.resources + 
##     Schooling, data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.8833  -2.5900   0.1524   2.6914  19.4391 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      4.256e+01  6.593e-01  64.552  < 2e-16 ***
## Alcohol                          3.571e-02  3.074e-02   1.162 0.245540    
## Measles                         -1.125e-05  9.646e-06  -1.166 0.243721    
## BMI                              5.243e-02  6.327e-03   8.287  < 2e-16 ***
## under.five.deaths               -6.712e-04  7.815e-04  -0.859 0.390553    
## Polio                            2.883e-02  5.682e-03   5.075 4.19e-07 ***
## Total.expenditure                1.532e-01  4.289e-02   3.571 0.000362 ***
## Diphtheria                       4.710e-02  5.685e-03   8.285  < 2e-16 ***
## HIV.AIDS                        -6.889e-01  1.996e-02 -34.516  < 2e-16 ***
## log(GDP)                         5.889e-01  7.086e-02   8.311  < 2e-16 ***
## thinness..1.19.years            -8.489e-02  3.047e-02  -2.786 0.005385 ** 
## Income.composition.of.resources  8.038e+00  8.268e-01   9.722  < 2e-16 ***
## Schooling                        7.919e-01  5.543e-02  14.286  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.642 on 2331 degrees of freedom
## Multiple R-squared:  0.7625, Adjusted R-squared:  0.7613 
## F-statistic: 623.7 on 12 and 2331 DF,  p-value: < 2.2e-16
plot(prelim.model)

vif(prelim.model)
##                         Alcohol                         Measles 
##                        1.547310                        1.394134 
##                             BMI               under.five.deaths 
##                        1.709194                        1.725861 
##                           Polio               Total.expenditure 
##                        1.984334                        1.160591 
##                      Diphtheria                        HIV.AIDS 
##                        1.994255                        1.124632 
##                        log(GDP)            thinness..1.19.years 
##                        1.633008                        1.947947 
## Income.composition.of.resources                       Schooling 
##                        3.076060                        3.498226

Feature Selection Tools: Penalized Regression

# Penalized Regression

#Setting kfold parameters
fitControl<-trainControl(method="repeatedcv",number=5,repeats=1) 

#Fitting glmnet
set.seed(1234)
glmnet.fit<-train(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling,
                  data=training,
                  method="glmnet",
                  trControl=fitControl
)

glmnet.fit
## glmnet 
## 
## 2344 samples
##   15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times) 
## Summary of sample sizes: 1875, 1877, 1874, 1875, 1875 
## Resampling results across tuning parameters:
## 
##   alpha  lambda      RMSE      Rsquared   MAE     
##   0.10   0.01359526  4.546733  0.7715121  3.428469
##   0.10   0.13595255  4.617190  0.7645704  3.477897
##   0.10   1.35952554  4.677957  0.7611292  3.543112
##   0.55   0.01359526  4.546678  0.7715357  3.427825
##   0.55   0.13595255  4.642083  0.7621296  3.494318
##   0.55   1.35952554  4.837724  0.7583217  3.646672
##   1.00   0.01359526  4.547465  0.7714789  3.427827
##   1.00   0.13595255  4.645565  0.7619759  3.493011
##   1.00   1.35952554  5.132045  0.7483316  3.890102
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.01359526.
plot(glmnet.fit)

#Investigating coefficients
opt.pen<-glmnet.fit$finalModel$lambdaOpt #penalty term
coef(glmnet.fit$finalModel,opt.pen)
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                                            s1
## (Intercept)                      4.460994e+01
## infant.deaths                    7.072859e-02
## Alcohol                          4.933926e-02
## percentage.expenditure           2.592483e-04
## Measles                         -6.563182e-06
## BMI                              5.356624e-02
## under.five.deaths               -5.267115e-02
## Polio                            2.688020e-02
## Total.expenditure                1.322282e-01
## Diphtheria                       4.334536e-02
## HIV.AIDS                        -6.760276e-01
## log(GDP)                         4.439098e-01
## thinness..1.19.years            -9.170910e-02
## thinness.5.9.years               .           
## Income.composition.of.resources  7.354446e+00
## Schooling                        7.755679e-01
glmnet.fit.model <-lm(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years +
              Income.composition.of.resources + Schooling,
          data=training)

plot(glmnet.fit.model)

#Lets force a LASSO model and add complexity
set.seed(1234)
glmnet.fit2<-train(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + poly(Polio,2) + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling,
                  data=training,
                  method="glmnet",
                  trControl=fitControl,
                  tuneGrid=expand.grid(data.frame(alpha=1,lambda=seq(0,.05,.001)))
)

glmnet.fit2
## glmnet 
## 
## 2344 samples
##   15 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 1 times) 
## Summary of sample sizes: 1875, 1877, 1874, 1875, 1875 
## Resampling results across tuning parameters:
## 
##   lambda  RMSE      Rsquared   MAE     
##   0.000   4.438996  0.7821622  3.353113
##   0.001   4.438996  0.7821622  3.353113
##   0.002   4.439178  0.7821469  3.353265
##   0.003   4.439604  0.7821102  3.353665
##   0.004   4.439905  0.7820844  3.354178
##   0.005   4.440391  0.7820411  3.354927
##   0.006   4.441047  0.7819817  3.355913
##   0.007   4.441877  0.7819056  3.356904
##   0.008   4.442881  0.7818127  3.357980
##   0.009   4.444052  0.7817038  3.359160
##   0.010   4.445390  0.7815786  3.360345
##   0.011   4.446942  0.7814329  3.361585
##   0.012   4.448665  0.7812705  3.362960
##   0.013   4.450543  0.7810932  3.364494
##   0.014   4.452643  0.7808933  3.366372
##   0.015   4.454931  0.7806745  3.368319
##   0.016   4.457387  0.7804391  3.370265
##   0.017   4.460005  0.7801876  3.372391
##   0.018   4.462793  0.7799192  3.374614
##   0.019   4.465766  0.7796322  3.376937
##   0.020   4.468895  0.7793297  3.379295
##   0.021   4.472143  0.7790153  3.381720
##   0.022   4.475542  0.7786857  3.384205
##   0.023   4.479080  0.7783420  3.386881
##   0.024   4.482819  0.7779772  3.389618
##   0.025   4.486731  0.7775947  3.392610
##   0.026   4.490825  0.7771933  3.395700
##   0.027   4.495096  0.7767735  3.398865
##   0.028   4.499526  0.7763374  3.402101
##   0.029   4.504118  0.7758847  3.405358
##   0.030   4.508868  0.7754155  3.408641
##   0.031   4.513601  0.7749468  3.411809
##   0.032   4.517809  0.7745270  3.414992
##   0.033   4.522146  0.7740937  3.418897
##   0.034   4.526346  0.7736733  3.422622
##   0.035   4.529756  0.7733298  3.425748
##   0.036   4.533274  0.7729749  3.428874
##   0.037   4.536688  0.7726295  3.431850
##   0.038   4.538484  0.7724495  3.433561
##   0.039   4.539767  0.7723218  3.434705
##   0.040   4.541073  0.7721918  3.435852
##   0.041   4.542062  0.7720926  3.436751
##   0.042   4.542531  0.7720457  3.437187
##   0.043   4.542508  0.7720491  3.437149
##   0.044   4.542487  0.7720524  3.437110
##   0.045   4.542467  0.7720556  3.437071
##   0.046   4.542450  0.7720586  3.437032
##   0.047   4.542435  0.7720615  3.436993
##   0.048   4.542421  0.7720644  3.436953
##   0.049   4.542409  0.7720671  3.436915
##   0.050   4.542399  0.7720696  3.436881
## 
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.001.
plot(glmnet.fit2)

opt.pen<-glmnet.fit2$finalModel$lambdaOpt   #penalty term
coef(glmnet.fit2$finalModel,opt.pen)
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                                            s1
## (Intercept)                      5.044976e+01
## infant.deaths                    1.003575e-01
## Alcohol                          4.928420e-02
## percentage.expenditure           3.159795e-04
## Measles                         -3.689026e-06
## BMI                              5.226929e-02
## under.five.deaths               -7.357662e-02
## poly(Polio, 2)1                  5.473565e+01
## poly(Polio, 2)2                  5.245271e+01
## Total.expenditure                1.330491e-01
## Diphtheria                       2.136060e-02
## HIV.AIDS                        -6.514221e-01
## log(GDP)                         3.653079e-01
## thinness..1.19.years            -1.069419e-01
## thinness.5.9.years               .           
## Income.composition.of.resources  6.837594e+00
## Schooling                        7.006484e-01
# Different way to do GLMNET

x=model.matrix(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling,
               training)[,-1]
y=log(training$Life.expectancy)

library(glmnet)
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-7
set.seed(1234)

grid=10^seq(10,-2, length =100)
lasso.mod=glmnet(x,y,alpha=1, lambda =grid)

cv.out=cv.glmnet(x,y,alpha=1)
plot(cv.out)

bestlambda<-cv.out$lambda.1se
coef(cv.out,s=bestlambda)
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                                            s1
## (Intercept)                      3.844715e+00
## infant.deaths                    3.126175e-04
## Alcohol                          1.412272e-04
## percentage.expenditure           2.088892e-06
## Measles                         -2.066844e-07
## BMI                              8.656727e-04
## under.five.deaths               -2.420026e-04
## Polio                            4.313928e-04
## Total.expenditure                1.324203e-03
## Diphtheria                       7.540932e-04
## HIV.AIDS                        -1.150787e-02
## log(GDP)                         6.791475e-03
## thinness..1.19.years            -8.147405e-04
## thinness.5.9.years               .           
## Income.composition.of.resources  1.152774e-01
## Schooling                        1.190970e-02

Feature Selection Tools: Forward Selection

fwd.selection = lm(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling, data = training)

# Forward
ols_step_forward_p(fwd.selection, penter = 0.05, details = TRUE)
## Forward Selection Method    
## ---------------------------
## 
## Candidate Terms: 
## 
## 1. infant.deaths 
## 2. Alcohol 
## 3. percentage.expenditure 
## 4. Measles 
## 5. BMI 
## 6. under.five.deaths 
## 7. Polio 
## 8. Total.expenditure 
## 9. Diphtheria 
## 10. HIV.AIDS 
## 11. log(GDP) 
## 12. thinness..1.19.years 
## 13. thinness.5.9.years 
## 14. Income.composition.of.resources 
## 15. Schooling 
## 
## We are selecting variables based on p value...
## 
## 
## Forward Selection: Step 1 
## 
## - Schooling 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.742       RMSE                6.376 
## R-Squared               0.550       Coef. Var           9.217 
## Adj. R-Squared          0.550       MSE                40.657 
## Pred R-Squared          0.548       MAE                 4.522 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                    
## --------------------------------------------------------------------------
##                   Sum of                                                  
##                  Squares          DF    Mean Square       F          Sig. 
## --------------------------------------------------------------------------
## Regression    116351.776           2      58175.888    1430.894    0.0000 
## Residual       95178.071        2341         40.657                       
## Total         211529.848        2343                                      
## --------------------------------------------------------------------------
## 
##                                             Parameter Estimates                                              
## ------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ------------------------------------------------------------------------------------------------------------
##                     (Intercept)    43.597         0.507                 86.019    0.000    42.603    44.591 
## Income.composition.of.resources    15.377         1.093        0.329    14.063    0.000    13.233    17.521 
##                       Schooling     1.322         0.069        0.450    19.231    0.000     1.187     1.457 
## ------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 2 
## 
## - HIV.AIDS 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.838       RMSE                5.191 
## R-Squared               0.702       Coef. Var           7.504 
## Adj. R-Squared          0.702       MSE                26.948 
## Pred R-Squared          0.700       MAE                 3.727 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                    
## --------------------------------------------------------------------------
##                   Sum of                                                  
##                  Squares          DF    Mean Square       F          Sig. 
## --------------------------------------------------------------------------
## Regression    148470.718           3      49490.239    1836.485    0.0000 
## Residual       63059.129        2340         26.948                       
## Total         211529.848        2343                                      
## --------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    48.062         0.432                 111.146    0.000    47.214    48.910 
## Income.composition.of.resources    11.453         0.897        0.245     12.762    0.000     9.693    13.213 
##                       Schooling     1.267         0.056        0.431     22.619    0.000     1.157     1.376 
##                        HIV.AIDS    -0.750         0.022       -0.402    -34.523    0.000    -0.793    -0.707 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 3 
## 
## - Diphtheria 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.855       RMSE                4.931 
## R-Squared               0.731       Coef. Var           7.128 
## Adj. R-Squared          0.731       MSE                24.314 
## Pred R-Squared          0.729       MAE                 3.605 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                    
## --------------------------------------------------------------------------
##                   Sum of                                                  
##                  Squares          DF    Mean Square       F          Sig. 
## --------------------------------------------------------------------------
## Regression    154658.745           4      38664.686    1590.205    0.0000 
## Residual       56871.102        2339         24.314                       
## Total         211529.848        2343                                      
## --------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    44.282         0.474                  93.387    0.000    43.352    45.212 
## Income.composition.of.resources    10.312         0.855        0.221     12.055    0.000     8.635    11.989 
##                       Schooling     1.129         0.054        0.385     20.962    0.000     1.024     1.235 
##                        HIV.AIDS    -0.722         0.021       -0.387    -34.868    0.000    -0.763    -0.682 
##                      Diphtheria     0.074         0.005        0.186     15.953    0.000     0.065     0.083 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 4 
## 
## - BMI 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.865       RMSE                4.776 
## R-Squared               0.748       Coef. Var           6.904 
## Adj. R-Squared          0.747       MSE                22.813 
## Pred R-Squared          0.746       MAE                 3.544 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                    
## --------------------------------------------------------------------------
##                   Sum of                                                  
##                  Squares          DF    Mean Square       F          Sig. 
## --------------------------------------------------------------------------
## Regression    158193.664           5      31638.733    1386.889    0.0000 
## Residual       53336.184        2338         22.813                       
## Total         211529.848        2343                                      
## --------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    44.336         0.459                  96.525    0.000    43.435    45.237 
## Income.composition.of.resources     9.331         0.832        0.200     11.211    0.000     7.699    10.963 
##                       Schooling     0.970         0.054        0.330     18.043    0.000     0.864     1.075 
##                        HIV.AIDS    -0.690         0.020       -0.370    -34.134    0.000    -0.730    -0.651 
##                      Diphtheria     0.069         0.005        0.174     15.309    0.000     0.060     0.078 
##                             BMI     0.074         0.006        0.154     12.448    0.000     0.062     0.086 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 5 
## 
## - log(GDP) 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.869       RMSE                4.704 
## R-Squared               0.756       Coef. Var           6.799 
## Adj. R-Squared          0.755       MSE                22.125 
## Pred R-Squared          0.753       MAE                 3.506 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                    
## --------------------------------------------------------------------------
##                   Sum of                                                  
##                  Squares          DF    Mean Square       F          Sig. 
## --------------------------------------------------------------------------
## Regression    159823.821           6      26637.303    1203.948    0.0000 
## Residual       51706.027        2337         22.125                       
## Total         211529.848        2343                                      
## --------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    42.099         0.522                  80.644    0.000    41.076    43.123 
## Income.composition.of.resources     8.057         0.833        0.173      9.672    0.000     6.424     9.691 
##                       Schooling     0.872         0.054        0.297     16.098    0.000     0.765     0.978 
##                        HIV.AIDS    -0.686         0.020       -0.368    -34.417    0.000    -0.725    -0.647 
##                      Diphtheria     0.068         0.004        0.171     15.333    0.000     0.060     0.077 
##                             BMI     0.067         0.006        0.140     11.352    0.000     0.055     0.079 
##                        log(GDP)     0.610         0.071        0.111      8.584    0.000     0.471     0.749 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 6 
## 
## - Polio 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.871       RMSE                4.679 
## R-Squared               0.758       Coef. Var           6.763 
## Adj. R-Squared          0.758       MSE                21.889 
## Pred R-Squared          0.756       MAE                 3.491 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                    
## --------------------------------------------------------------------------
##                   Sum of                                                  
##                  Squares          DF    Mean Square       F          Sig. 
## --------------------------------------------------------------------------
## Regression    160397.905           7      22913.986    1046.842    0.0000 
## Residual       51131.943        2336         21.889                       
## Total         211529.848        2343                                      
## --------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    41.581         0.529                  78.597    0.000    40.543    42.618 
## Income.composition.of.resources     7.914         0.829        0.169      9.546    0.000     6.288     9.540 
##                       Schooling     0.855         0.054        0.291     15.850    0.000     0.749     0.961 
##                        HIV.AIDS    -0.683         0.020       -0.367    -34.475    0.000    -0.722    -0.644 
##                      Diphtheria     0.050         0.006        0.125      8.740    0.000     0.039     0.061 
##                             BMI     0.066         0.006        0.137     11.195    0.000     0.054     0.077 
##                        log(GDP)     0.604         0.071        0.110      8.539    0.000     0.465     0.742 
##                           Polio     0.029         0.006        0.073      5.121    0.000     0.018     0.041 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 7 
## 
## - percentage.expenditure 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.872       RMSE                4.657 
## R-Squared               0.761       Coef. Var           6.732 
## Adj. R-Squared          0.760       MSE                21.689 
## Pred R-Squared          0.758       MAE                 3.479 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    160886.865           8      20110.858    927.253    0.0000 
## Residual       50642.982        2335         21.689                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    42.827         0.588                  72.787    0.000    41.673    43.980 
## Income.composition.of.resources     7.810         0.826        0.167      9.460    0.000     6.191     9.429 
##                       Schooling     0.839         0.054        0.286     15.592    0.000     0.733     0.944 
##                        HIV.AIDS    -0.685         0.020       -0.367    -34.709    0.000    -0.724    -0.646 
##                      Diphtheria     0.050         0.006        0.127      8.882    0.000     0.039     0.062 
##                             BMI     0.067         0.006        0.139     11.403    0.000     0.055     0.078 
##                        log(GDP)     0.431         0.079        0.078      5.441    0.000     0.276     0.586 
##                           Polio     0.029         0.006        0.074      5.161    0.000     0.018     0.041 
##          percentage.expenditure     0.000         0.000        0.059      4.748    0.000     0.000     0.000 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 8 
## 
## - thinness..1.19.years 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.873       RMSE                4.640 
## R-Squared               0.762       Coef. Var           6.707 
## Adj. R-Squared          0.762       MSE                21.527 
## Pred R-Squared          0.760       MAE                 3.481 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    161286.681           9      17920.742    832.492    0.0000 
## Residual       50243.166        2334         21.527                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    44.180         0.665                  66.433    0.000    42.876    45.484 
## Income.composition.of.resources     7.719         0.823        0.165      9.382    0.000     6.105     9.332 
##                       Schooling     0.814         0.054        0.277     15.106    0.000     0.709     0.920 
##                        HIV.AIDS    -0.680         0.020       -0.364    -34.497    0.000    -0.718    -0.641 
##                      Diphtheria     0.050         0.006        0.125      8.769    0.000     0.039     0.061 
##                             BMI     0.057         0.006        0.118      8.989    0.000     0.044     0.069 
##                        log(GDP)     0.433         0.079        0.079      5.483    0.000     0.278     0.588 
##                           Polio     0.029         0.006        0.074      5.188    0.000     0.018     0.041 
##          percentage.expenditure     0.000         0.000        0.055      4.437    0.000     0.000     0.000 
##            thinness..1.19.years    -0.115         0.027       -0.053     -4.310    0.000    -0.168    -0.063 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## Forward Selection: Step 9 
## 
## - Total.expenditure 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.874       RMSE                4.629 
## R-Squared               0.764       Coef. Var           6.691 
## Adj. R-Squared          0.763       MSE                21.428 
## Pred R-Squared          0.761       MAE                 3.474 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    161538.357          10      16153.836    753.866    0.0000 
## Residual       49991.490        2333         21.428                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    43.422         0.699                  62.084    0.000    42.051    44.794 
## Income.composition.of.resources     7.910         0.823        0.169      9.615    0.000     6.297     9.524 
##                       Schooling     0.795         0.054        0.271     14.704    0.000     0.689     0.901 
##                        HIV.AIDS    -0.685         0.020       -0.367    -34.741    0.000    -0.724    -0.646 
##                      Diphtheria     0.049         0.006        0.122      8.580    0.000     0.037     0.060 
##                             BMI     0.054         0.006        0.113      8.631    0.000     0.042     0.067 
##                        log(GDP)     0.454         0.079        0.083      5.751    0.000     0.299     0.609 
##                           Polio     0.029         0.006        0.073      5.177    0.000     0.018     0.040 
##          percentage.expenditure     0.000         0.000        0.051      4.029    0.000     0.000     0.000 
##            thinness..1.19.years    -0.102         0.027       -0.047     -3.795    0.000    -0.155    -0.049 
##               Total.expenditure     0.145         0.042        0.037      3.427    0.001     0.062     0.228 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables to be added.
## 
## Variables Entered: 
## 
## + Income.composition.of.resources 
## + Schooling 
## + HIV.AIDS 
## + Diphtheria 
## + BMI 
## + log(GDP) 
## + Polio 
## + percentage.expenditure 
## + thinness..1.19.years 
## + Total.expenditure 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.874       RMSE                4.629 
## R-Squared               0.764       Coef. Var           6.691 
## Adj. R-Squared          0.763       MSE                21.428 
## Pred R-Squared          0.761       MAE                 3.474 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    161538.357          10      16153.836    753.866    0.0000 
## Residual       49991.490        2333         21.428                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    43.422         0.699                  62.084    0.000    42.051    44.794 
## Income.composition.of.resources     7.910         0.823        0.169      9.615    0.000     6.297     9.524 
##                       Schooling     0.795         0.054        0.271     14.704    0.000     0.689     0.901 
##                        HIV.AIDS    -0.685         0.020       -0.367    -34.741    0.000    -0.724    -0.646 
##                      Diphtheria     0.049         0.006        0.122      8.580    0.000     0.037     0.060 
##                             BMI     0.054         0.006        0.113      8.631    0.000     0.042     0.067 
##                        log(GDP)     0.454         0.079        0.083      5.751    0.000     0.299     0.609 
##                           Polio     0.029         0.006        0.073      5.177    0.000     0.018     0.040 
##          percentage.expenditure     0.000         0.000        0.051      4.029    0.000     0.000     0.000 
##            thinness..1.19.years    -0.102         0.027       -0.047     -3.795    0.000    -0.155    -0.049 
##               Total.expenditure     0.145         0.042        0.037      3.427    0.001     0.062     0.228 
## -------------------------------------------------------------------------------------------------------------
## 
##                                          Selection Summary                                           
## ----------------------------------------------------------------------------------------------------
##         Variable                                         Adj.                                           
## Step                Entered                R-Square    R-Square      C(p)          AIC         RMSE     
## ----------------------------------------------------------------------------------------------------
##    1    Income.composition.of.resources      0.5500      0.5497    2327.9505    15341.9036    6.3763    
##    2    Schooling                            0.7019      0.7015     755.3715    14378.9334    5.1912    
##    3    HIV.AIDS                             0.7311      0.7307     454.0135    14138.8322    4.9310    
##    4    Diphtheria                           0.7479      0.7473     282.7199    13990.4121    4.7763    
##    5    BMI                                  0.7556      0.7549     204.8041    13919.6529    4.7037    
##    6    log(GDP)                             0.7583      0.7576     178.6605    13895.4823    4.6785    
##    7    Polio                                0.7606      0.7598     156.6900    13874.9594    4.6571    
##    8    percentage.expenditure               0.7625      0.7616     139.0897    13858.3806    4.6397    
##    9    thinness..1.19.years                 0.7637      0.7627     128.7517    13848.6096    4.6290    
##   10    Total.expenditure                        NA          NA           NA            NA        NA    
## ----------------------------------------------------------------------------------------------------
# Forward Selection Chosen Model

fwd.select.model <- lm(Life.expectancy~ Income.composition.of.resources + Schooling +
                         HIV.AIDS + Diphtheria +
                         BMI + log(GDP) + Polio +
                         percentage.expenditure + thinness..1.19.years + Total.expenditure, training)

plot(fwd.select.model)

library(leaps)
## Warning: package 'leaps' was built under R version 4.2.3
reg.fwd=regsubsets(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling,
                   data=training,method="forward",nvmax=15)

summary(reg.fwd)$adjr2
##  [1] 0.5118279 0.6808674 0.7140742 0.7338469 0.7473158 0.7549340 0.7575512
##  [8] 0.7597668 0.7615613 0.7626540 0.7628746 0.7628610 0.7737752 0.7741531
## [15] 0.7740578
summary(reg.fwd)$rss
##  [1] 103218.90  67448.44  60404.40  56203.20  53336.18  51706.03  51131.94
##  [8]  50642.98  50243.17  49991.49  49923.61  49905.07  47587.78  47487.89
## [15]  47487.55
summary(reg.fwd)$bic
##  [1] -1666.334 -2655.919 -2906.706 -3067.922 -3182.891 -3247.891 -3266.302
##  [8] -3281.065 -3291.884 -3295.895 -3291.321 -3284.432 -3388.122 -3385.287
## [15] -3377.544
par(mfrow=c(1,3))
bics<-summary(reg.fwd)$bic
plot(1:15,bics,type="l",ylab="BIC",xlab="# of predictors")
index<-which(bics==min(bics))
points(index,bics[index],col="red",pch=10)

adjr2<-summary(reg.fwd)$adjr2
plot(1:15,adjr2,type="l",ylab="Adjusted R-squared",xlab="# of predictors")
index<-which(adjr2==max(adjr2))
points(index,adjr2[index],col="red",pch=10)

rss<-summary(reg.fwd)$rss
plot(1:15,rss,type="l",ylab="train RSS",xlab="# of predictors")
index<-which(rss==min(rss))
points(index,rss[index],col="red",pch=10)

Backward Selection

bck.selection = lm(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling, data = training)

# Backward
ols_step_backward_p(bck.selection, prem = 0.05, details = TRUE)
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . infant.deaths 
## 2 . Alcohol 
## 3 . percentage.expenditure 
## 4 . Measles 
## 5 . BMI 
## 6 . under.five.deaths 
## 7 . Polio 
## 8 . Total.expenditure 
## 9 . Diphtheria 
## 10 . HIV.AIDS 
## 11 . log(GDP) 
## 12 . thinness..1.19.years 
## 13 . thinness.5.9.years 
## 14 . Income.composition.of.resources 
## 15 . Schooling 
## 
## We are eliminating variables based on p value...
## 
## - thinness.5.9.years 
## 
## Backward Elimination: Step 1 
## 
##  Variable thinness.5.9.years Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.881       RMSE                4.516 
## R-Squared               0.776       Coef. Var           6.527 
## Adj. R-Squared          0.774       MSE                20.390 
## Pred R-Squared          0.771       MAE                 3.389 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    164041.953          14      11717.282    574.663    0.0000 
## Residual       47487.895        2329         20.390                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    45.159         0.702                  64.284    0.000    43.781    46.536 
##                   infant.deaths     0.112         0.010        1.387     10.864    0.000     0.091     0.132 
##                         Alcohol     0.067         0.030        0.027      2.213    0.027     0.008     0.127 
##          percentage.expenditure     0.000         0.000        0.055      4.452    0.000     0.000     0.000 
##                         Measles     0.000         0.000       -0.005     -0.419    0.675     0.000     0.000 
##                             BMI     0.053         0.006        0.111      8.655    0.000     0.041     0.065 
##               under.five.deaths    -0.083         0.008       -1.404    -10.912    0.000    -0.098    -0.068 
##                           Polio     0.026         0.006        0.065      4.670    0.000     0.015     0.037 
##               Total.expenditure     0.132         0.042        0.034      3.160    0.002     0.050     0.215 
##                      Diphtheria     0.041         0.006        0.103      7.341    0.000     0.030     0.052 
##                        HIV.AIDS    -0.671         0.019       -0.360    -34.460    0.000    -0.710    -0.633 
##                        log(GDP)     0.443         0.077        0.081      5.723    0.000     0.291     0.594 
##            thinness..1.19.years    -0.096         0.030       -0.044     -3.233    0.001    -0.154    -0.038 
## Income.composition.of.resources     7.023         0.809        0.150      8.681    0.000     5.436     8.609 
##                       Schooling     0.768         0.054        0.262     14.235    0.000     0.662     0.874 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## - Measles 
## 
## Backward Elimination: Step 2 
## 
##  Variable Measles Removed 
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.881       RMSE                4.515 
## R-Squared               0.775       Coef. Var           6.526 
## Adj. R-Squared          0.774       MSE                20.383 
## Pred R-Squared          0.772       MAE                 3.389 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    164038.369          13      12618.336    619.074    0.0000 
## Residual       47491.479        2330         20.383                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    45.142         0.701                  64.370    0.000    43.767    46.518 
##                   infant.deaths     0.112         0.010        1.391     10.919    0.000     0.092     0.132 
##                         Alcohol     0.067         0.030        0.027      2.208    0.027     0.007     0.127 
##          percentage.expenditure     0.000         0.000        0.055      4.459    0.000     0.000     0.000 
##                             BMI     0.054         0.006        0.112      8.710    0.000     0.042     0.066 
##               under.five.deaths    -0.083         0.008       -1.411    -11.035    0.000    -0.098    -0.068 
##                           Polio     0.026         0.006        0.065      4.667    0.000     0.015     0.037 
##               Total.expenditure     0.133         0.042        0.034      3.185    0.001     0.051     0.215 
##                      Diphtheria     0.041         0.006        0.103      7.348    0.000     0.030     0.052 
##                        HIV.AIDS    -0.671         0.019       -0.360    -34.464    0.000    -0.709    -0.633 
##                        log(GDP)     0.442         0.077        0.080      5.719    0.000     0.290     0.594 
##            thinness..1.19.years    -0.095         0.030       -0.044     -3.212    0.001    -0.153    -0.037 
## Income.composition.of.resources     7.035         0.808        0.151      8.704    0.000     5.450     8.620 
##                       Schooling     0.767         0.054        0.261     14.231    0.000     0.662     0.873 
## -------------------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables satisfy the condition of p value = 0.05
## 
## 
## Variables Removed: 
## 
## - thinness.5.9.years 
## - Measles 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                          
## --------------------------------------------------------------
## R                       0.881       RMSE                4.515 
## R-Squared               0.775       Coef. Var           6.526 
## Adj. R-Squared          0.774       MSE                20.383 
## Pred R-Squared          0.772       MAE                 3.389 
## --------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                   ANOVA                                   
## -------------------------------------------------------------------------
##                   Sum of                                                 
##                  Squares          DF    Mean Square       F         Sig. 
## -------------------------------------------------------------------------
## Regression    164038.369          13      12618.336    619.074    0.0000 
## Residual       47491.479        2330         20.383                      
## Total         211529.848        2343                                     
## -------------------------------------------------------------------------
## 
##                                              Parameter Estimates                                              
## -------------------------------------------------------------------------------------------------------------
##                           model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## -------------------------------------------------------------------------------------------------------------
##                     (Intercept)    45.142         0.701                  64.370    0.000    43.767    46.518 
##                   infant.deaths     0.112         0.010        1.391     10.919    0.000     0.092     0.132 
##                         Alcohol     0.067         0.030        0.027      2.208    0.027     0.007     0.127 
##          percentage.expenditure     0.000         0.000        0.055      4.459    0.000     0.000     0.000 
##                             BMI     0.054         0.006        0.112      8.710    0.000     0.042     0.066 
##               under.five.deaths    -0.083         0.008       -1.411    -11.035    0.000    -0.098    -0.068 
##                           Polio     0.026         0.006        0.065      4.667    0.000     0.015     0.037 
##               Total.expenditure     0.133         0.042        0.034      3.185    0.001     0.051     0.215 
##                      Diphtheria     0.041         0.006        0.103      7.348    0.000     0.030     0.052 
##                        HIV.AIDS    -0.671         0.019       -0.360    -34.464    0.000    -0.709    -0.633 
##                        log(GDP)     0.442         0.077        0.080      5.719    0.000     0.290     0.594 
##            thinness..1.19.years    -0.095         0.030       -0.044     -3.212    0.001    -0.153    -0.037 
## Income.composition.of.resources     7.035         0.808        0.151      8.704    0.000     5.450     8.620 
##                       Schooling     0.767         0.054        0.261     14.231    0.000     0.662     0.873 
## -------------------------------------------------------------------------------------------------------------
## 
## 
##                                  Elimination Summary                                  
## -------------------------------------------------------------------------------------
##         Variable                            Adj.                                         
## Step         Removed          R-Square    R-Square     C(p)         AIC         RMSE     
## -------------------------------------------------------------------------------------
##    1    thinness.5.9.years      0.7755      0.7742    14.0170    13736.1797    4.5155    
##    2    Measles                 0.7755      0.7742    12.1927    13734.3566    4.5147    
## -------------------------------------------------------------------------------------
# Backward Selection Chosen Model
bck.select.model <- lm(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + 
              Income.composition.of.resources + Schooling, data = training)

plot(bck.select.model)

Stepwise Selection

sw.selection = lm(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling, data = training)
summary(sw.selection)
## 
## Call:
## lm(formula = Life.expectancy ~ infant.deaths + Alcohol + percentage.expenditure + 
##     Measles + BMI + under.five.deaths + Polio + Total.expenditure + 
##     Diphtheria + HIV.AIDS + log(GDP) + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling, 
##     data = training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -26.0853  -2.5875   0.0947   2.6344  18.6763 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      4.515e+01  7.078e-01  63.787  < 2e-16 ***
## infant.deaths                    1.115e-01  1.028e-02  10.853  < 2e-16 ***
## Alcohol                          6.734e-02  3.040e-02   2.215  0.02686 *  
## percentage.expenditure           2.696e-04  6.057e-05   4.451 8.96e-06 ***
## Measles                         -3.885e-06  9.417e-06  -0.413  0.67999    
## BMI                              5.349e-02  6.217e-03   8.604  < 2e-16 ***
## under.five.deaths               -8.275e-02  7.588e-03 -10.906  < 2e-16 ***
## Polio                            2.584e-02  5.535e-03   4.669 3.20e-06 ***
## Total.expenditure                1.327e-01  4.197e-02   3.161  0.00159 ** 
## Diphtheria                       4.087e-02  5.571e-03   7.336 3.03e-13 ***
## HIV.AIDS                        -6.714e-01  1.951e-02 -34.419  < 2e-16 ***
## log(GDP)                         4.431e-01  7.745e-02   5.721 1.20e-08 ***
## thinness..1.19.years            -1.031e-01  6.194e-02  -1.665  0.09611 .  
## thinness.5.9.years               7.969e-03  6.110e-02   0.130  0.89624    
## Income.composition.of.resources  7.023e+00  8.091e-01   8.679  < 2e-16 ***
## Schooling                        7.680e-01  5.398e-02  14.226  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.516 on 2328 degrees of freedom
## Multiple R-squared:  0.7755, Adjusted R-squared:  0.7741 
## F-statistic: 536.1 on 15 and 2328 DF,  p-value: < 2.2e-16
# Stepwise
ols_step_both_p(sw.selection, prem = 0.05, pent = 0.05, details = FALSE)
## 
##                                            Stepwise Selection Summary                                            
## ----------------------------------------------------------------------------------------------------------------
##                                             Added/                   Adj.                                           
## Step               Variable                Removed     R-Square    R-Square      C(p)          AIC         RMSE     
## ----------------------------------------------------------------------------------------------------------------
##    1    Income.composition.of.resources    addition       0.550       0.550    2327.9510    15341.9036    6.3763    
##    2               Schooling               addition       0.702       0.702     755.3720    14378.9334    5.1912    
##    3               HIV.AIDS                addition       0.731       0.731     454.0140    14138.8322    4.9310    
##    4              Diphtheria               addition       0.748       0.747     282.7200    13990.4121    4.7763    
##    5                  BMI                  addition       0.756       0.755     204.8040    13919.6529    4.7037    
##    6               log(GDP)                addition       0.758       0.758     178.6610    13895.4823    4.6785    
##    7                 Polio                 addition       0.761       0.760     156.6900    13874.9594    4.6571    
##    8        percentage.expenditure         addition       0.762       0.762     139.0900    13858.3806    4.6397    
##    9         thinness..1.19.years          addition       0.764       0.763     128.7520    13848.6096    4.6290    
## ----------------------------------------------------------------------------------------------------------------
# Stepwise Selection Chosen Model
sw.select.model = lm(Life.expectancy~ percentage.expenditure + 
                       BMI + Polio + Diphtheria + HIV.AIDS + 
                       log(GDP) + thinness..1.19.years +
                       Income.composition.of.resources + Schooling, data = training)

plot(sw.select.model)

Forward Selection Validation

fwd.train=regsubsets(Life.expectancy~infant.deaths + Alcohol + 
              percentage.expenditure + Measles + BMI + 
              under.five.deaths + Polio + Total.expenditure + 
              Diphtheria + HIV.AIDS + log(GDP) + 
              thinness..1.19.years + thinness.5.9.years +
              Income.composition.of.resources + Schooling,
                    data=training,method="forward",nvmax=15)

#Creating a prediction function 
predict.regsubsets =function (object , newdata ,id ,...){
  form=as.formula (object$call [[2]])
  mat=model.matrix(form ,newdata )
  coefi=coef(object ,id=id)
  xvars=names(coefi)
  mat[,xvars]%*%coefi
}

valMSE<-c()
#note my index, i, is to 15 since that is how many predictors I went up to during fwd selection
for (i in 1:15){
  predictions<-predict.regsubsets(object=fwd.train,newdata=validate,id=i) 
  valMSE[i]<-mean((validate$Life.expectancy-predictions)^2)
}
par(mfrow=c(1,1))
plot(1:15,sqrt(valMSE),type="l",xlab="# of predictors",
     ylab="test vs train RMSE")
index<-which(valMSE==min(valMSE))
points(index,sqrt(valMSE[index]),col="red",pch=10)

trainMSE<-summary(fwd.train)$rss/nrow(training)
lines(1:15,sqrt(trainMSE),lty=3,col="blue")  

Adding Complexity to the Model

# Transformations using Polio as a test

transformations <- training

# Original
training %>% ggplot(aes(x=Polio, y = Life.expectancy)) + 
           geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Transforming with Polynomial using a ^2
transformations$polio2 <- transformations$Polio^2
transformations %>% ggplot(aes(x=polio2, y=Life.expectancy)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Transforming using Log
transformations$logPolio <- log(transformations$Polio)

transformations %>% ggplot(aes(x=logPolio, y=Life.expectancy)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Testing if it's the same as logging inside the plot
transformations %>% ggplot(aes(x=log(Polio), y=Life.expectancy)) +
geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Creating plot functions

# Original Data plot
plot_original <- function(var1) {
  return(training %>% ggplot(aes(x=var1, y = Life.expectancy)) + 
           geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}

plot_original_interaction <- function(var1) {
  return(training %>% ggplot(aes(x=var1, 
                                        y = Life.expectancy, 
                                        color=Status)) + 
           geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}

plot_original(training$Polio)+ 
  xlab("Polio Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Creating function to plot variable as polynomials 

plot_poly <- function(var1) {
  return(training %>% ggplot(aes(x=var1^2, 
                                      y = Life.expectancy,
                                      color=Status)) + 
         geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}

plot_poly(training$Polio)+ 
  xlab("Polio Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Creating function to plot variables as logged
plot_log <- function(var1) {
  return(training %>% ggplot(aes(x=log(var1), 
                                      y = Life.expectancy, 
                                      color=Status)) + 
         geom_point() + geom_smooth () + ylab("Life Expectancy in Age")
)
}

plot_log(training$Polio) + 
  xlab("Polio Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Putting each Variable into the Function

# Infant Deaths Variable

plot_original(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$infant.deaths) + xlab("Infant Deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 656 rows containing non-finite values (`stat_smooth()`).

# Alcohol Variable

plot_original(training$Alcohol) + 
  xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$Alcohol) + 
  xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$Alcohol) + 
  xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$Alcohol) + 
  xlab("Alcohol Consumption per capita in Litres of pure alcohol")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Percentage of Expenditures Variable

plot_original(training$percentage.expenditure) + 
  xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$percentage.expenditure) + 
  xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$percentage.expenditure) + 
  xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$percentage.expenditure) + 
  xlab("Expenditure on health as a percentage of Gross Domestic Product per capita")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 500 rows containing non-finite values (`stat_smooth()`).

# Measles Variable

plot_original(training$Measles) + xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$Measles) + xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$Measles)+ xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$Measles)+ xlab("Reported Measles cases per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 778 rows containing non-finite values (`stat_smooth()`).

# BMI Variable

plot_original(training$BMI) + xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$BMI) + xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$BMI)+ xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$BMI)+ xlab("Average Body Mass Index of population")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Under Five Deaths Variable

plot_original(training$under.five.deaths) + 
  xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$under.five.deaths) + 
  xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$under.five.deaths)+ 
  xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$under.five.deaths)+ 
  xlab("Under age 5 deaths per 1000")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 604 rows containing non-finite values (`stat_smooth()`).

# Total Expenditures Variable

plot_original(training$Total.expenditure) + 
  xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$Total.expenditure) + 
  xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$Total.expenditure)+ 
  xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$Total.expenditure)+ 
  xlab("Percentage of total government expenditures on health")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Diphtheria Variable

plot_original(training$Diphtheria) + 
  xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$Diphtheria) + 
  xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$Diphtheria)+ 
  xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$Diphtheria)+ 
  xlab("DPT3 Immunization Coverage among 1-year-olds")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# HIV AIDS Variable

plot_original(training$HIV.AIDS) + xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$HIV.AIDS) + xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$HIV.AIDS)+ xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$HIV.AIDS)+ xlab ("HIV/AIDS Deaths per 1000 live births")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# GDP Variable

plot_original(training$GDP) + xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$GDP) + xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$GDP)+ xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$GDP)+ xlab("Gross Domestic Product per capita in USD")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Thinness 10-19 Years Variable

plot_original(training$thinness..1.19.years) + 
  xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$thinness..1.19.years) + 
  xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$thinness..1.19.years)+ 
  xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$thinness..1.19.years)+ 
  xlab("Prevalence of thinness among children ages 10 to 19")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Thinness 5-9 Variable

plot_original(training$thinness.5.9.years)+ 
  xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$thinness.5.9.years)+ 
  xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$thinness.5.9.years)+ 
  xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$thinness.5.9.years)+ 
  xlab("Prevalence of thinness among children ages 5 to 9")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Income Composition of Resources Variable

plot_original(training$Income.composition.of.resources) + 
  xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$Income.composition.of.resources) + 
  xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$Income.composition.of.resources)+ 
  xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$Income.composition.of.resources)+ 
  xlab("Human Development Index in terms of income composition of resources")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 101 rows containing non-finite values (`stat_smooth()`).

# Schooling Variable

plot_original(training$Schooling) + xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_original_interaction(training$Schooling) + xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_poly(training$Schooling)+ xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

plot_log(training$Schooling)+ xlab("Years in School")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 21 rows containing non-finite values (`stat_smooth()`).

Original Model for comparison

original.model <- lm(Life.expectancy~ Polio + Alcohol + BMI +
                       Diphtheria+ Schooling + HIV.AIDS + GDP +
                       thinness..1.19.years + Measles + Total.expenditure + 
                       Income.composition.of.resources, training)

plot(original.model)

Fitting a complex model with polynomials up to 2 & logged variables

poly2logmodel <- lm(Life.expectancy~ poly(Polio,2)+
                      poly(Alcohol,2)+
                      poly(BMI,2)+
                      poly(Diphtheria,2)+
                      poly(Schooling,2)+
                      log(HIV.AIDS) +
                      log(GDP) +
                      log(thinness..1.19.years) +
                      Measles +
                      Total.expenditure+
                      Income.composition.of.resources, training)

plot(poly2logmodel)

Fitting a complex model with polynomials up to 7 & logged variables

poly7logmodel <- lm(Life.expectancy~ poly(Polio,7)+
                      sqrt(Alcohol)+
                      poly(BMI,7)+
                      poly(Diphtheria,5)+
                      poly(Schooling,7)+
                      log(HIV.AIDS) +
                      log(GDP) +
                      log(thinness..1.19.years) +
                      Measles +
                      Total.expenditure+
                      Income.composition.of.resources, training)

plot(poly7logmodel)

Fitting interaction terms on regular model

interaction.model <- lm(Life.expectancy~ Polio:Status+Alcohol:Status+BMI:Status+
                          Diphtheria:Status+Schooling:Status+
                          HIV.AIDS:Status+GDP:Status+
                          thinness..1.19.years:Status+
                          Measles:Status+Total.expenditure:Status+
                          Income.composition.of.resources:Status, training)

plot(interaction.model)

Fitting interaction on poly2logmodel

poly2log.interaction.model <- lm(Life.expectancy~ poly(Polio,2):Status+
                      poly(Alcohol,2):Status+
                      poly(BMI,2):Status+
                      poly(Diphtheria,2):Status+
                      poly(Schooling,2):Status+
                      log(HIV.AIDS):Status +
                      log(GDP):Status +
                      log(thinness..1.19.years):Status +
                      Measles:Status +
                      Total.expenditure:Status+
                      Income.composition.of.resources:Status, training)

plot(poly2log.interaction.model)

Fitting interaction on poly7logmodel

poly7log.interaction.model <- lm(Life.expectancy~ poly(Polio,7):Status+
                      sqrt(Alcohol):Status+
                      poly(BMI,7):Status+
                      poly(Diphtheria,5):Status+
                      poly(Schooling,7):Status+
                      log(HIV.AIDS):Status +
                      log(GDP):Status +
                      log(thinness..1.19.years):Status +
                      Measles:Status +
                      Total.expenditure:Status+
                      Income.composition.of.resources:Status, training)

plot(poly7log.interaction.model)

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Fitting a complex model with polynomials up to 7 & logged variables and categorical variable Status

status.poly7logmodel <- lm(Life.expectancy~ Status + poly(Polio,7)+
                      sqrt(Alcohol)+
                      poly(BMI,7)+
                      poly(Diphtheria,5)+
                      poly(Schooling,7)+
                      log(HIV.AIDS) +
                      log(GDP) +
                      log(thinness..1.19.years) +
                      Measles +
                      Total.expenditure+
                      Income.composition.of.resources, training)

plot(status.poly7logmodel)

Simple Model

simple <- lm(Life.expectancy ~ HIV.AIDS + Alcohol:Status + Total.expenditure, training)
plot(simple)

Predictions

# RMSE of Full Model
full.model_Preds = predict(full.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = full.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.485943
# RMSE of Preliminary Model
prelim.model_Preds = predict(prelim.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = prelim.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.553895
# RMSE of Penalized Regression Model
glmnet.fit.model_Preds = predict(glmnet.fit.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = glmnet.fit.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.40931
# RMSE of Forward Selection Model
fwd.select.model_Preds = predict(fwd.select.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = fwd.select.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.528587
# RMSE of Backward Selection Model
bck.select.model_Preds = predict(bck.select.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = bck.select.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.412429
# RMSE of Stepwise Selection Model
sw.select.model_Preds = predict(sw.select.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = sw.select.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.521186
# RMSE of Original Model
original.model_Preds = predict(original.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = original.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.548685
# RMSE of Poly2Log Model
poly2logmodel_Preds = predict(poly2logmodel, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = poly2logmodel_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.80083
# RMSE of Poly7Log Model

poly7logmodel_Preds = predict(poly7logmodel, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = poly7logmodel_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.734417
# RMSE of Original with Interaction Model
interaction.model_Preds = predict(interaction.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = interaction.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 4.369254
# RMSE of Poly2Log Model with interactions
poly2log.interaction.model_Preds = predict(poly2log.interaction.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = poly2log.interaction.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.671729
# RMSE of Poly7Log Model with interactions
poly7log.interaction.model_Preds = predict(poly7log.interaction.model, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = poly7log.interaction.model_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.533711
# RMSE of Poly7Log Model with Categorical variable Status added (not as interaction)
status.poly7logmodel_Preds = predict(status.poly7logmodel, newdata = validate)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = status.poly7logmodel_Preds)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 3.702815

KNN Model Predictions

library(caret)
fit_cont1 = trainControl(method = "repeatedcv", number = 10, repeats = 1)
set.seed(1364)

knnfit1 = train(Life.expectancy~Adult.Mortality + infant.deaths + Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + thinness.5.9.years + Income.composition.of.resources + Schooling + GDP, data =training, method = "knn", trControl = fit_cont1, tuneGrid = expand.grid(k = c(1:30)))

plot(knnfit1)

updateval = validate[,c("Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol", "percentage.expenditure", "Measles", "BMI", "under.five.deaths", "Polio", "Total.expenditure", "Diphtheria", "HIV.AIDS", "thinness..1.19.years","thinness.5.9.years",  "Income.composition.of.resources", "Schooling", "GDP")]

prediction = predict(knnfit1, newdata = updateval)

MSPE = data.frame(Observed = validate$Life.expectancy, 
                  Predicted = prediction)
MSPE$Residual = MSPE$Observed - MSPE$Predicted
MSPE$SquaredResidual = MSPE$Residual^2
sqrt(mean(MSPE$SquaredResidual))
## [1] 5.595665
#cf = confusionMatrix(prediction,updateval$Life.expectancy))
#cf